# coding:utf8
import functools
import json
import re
import sys
import time

import HTMLParser  # Python 2 stdlib

import pymongo
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient

import table
from utils import kill_captcha

path = sys.path[0]


def time_time(time):  # NOTE: parameter intentionally keeps its historical name; it shadows the `time` module inside this function
    """Normalise a date string into 'YYYY-MM-DD 00:00:00'.

    Strips all whitespace, extracts every digit run (so both
    '2014年5月3日' and '2015-12-1' work), zero-pads runs shorter than
    two digits and joins them with '-'. Returns '' when the input is
    empty after whitespace removal.
    """
    time = time.replace('\t', '').replace('\n', '').replace('\r', '').replace(' ', '')
    if not time:
        return ''
    # zfill(2) pads single digits exactly like the old `"0" + i` branch did
    parts = [digits.zfill(2) for digits in re.findall(r'\d+', time)]
    return '-'.join(parts) + ' 00:00:00'


def try_number(num):
    """Decorator factory: retry the wrapped callable on any exception.

    The original counter was never incremented, so for num > 0 the loop
    retried forever; it now gives up after *num* retries (returning None,
    as the original's `break` did).
    """
    def _func(func):
        @functools.wraps(func)  # preserve the wrapped function's metadata
        def __func(*args, **kwargs):
            failures = 0
            while True:
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    print('%s: %s (failure #%s)' % (type(e).__name__, e, failures))
                    if failures >= num:
                        return None  # retries exhausted
                    failures += 1
        return __func
    return _func


def reclean(text, html):
    """Return the first match of regex *text* in *html* with any '>' stripped.

    The '>' removal lets callers' patterns capture from the middle of an
    opening tag (e.g. `...width="30%"(.+?)</td>` captures '>value').
    Returns '' when the pattern does not match (was a bare except).
    """
    try:
        return re.findall(text, html)[0].replace('>', '')
    except IndexError:  # no match found
        return ''


def html_clean(text):
    """Stringify *text*, strip HTML tags and remove every whitespace char."""
    stripped = re.sub('<[\s\S]+?>', '', str(text))
    for ws in ('\n', ' ', '\t', '\r'):
        stripped = stripped.replace(ws, '')
    return stripped


def get_image(image):  # solve an image captcha
    """Send raw captcha *image* bytes to the external solver and return the text.

    `kill_captcha` is the project-local utils helper; 'sc' presumably tags
    the Sichuan GSXT site and 'jpeg' the image format — TODO confirm
    against utils.kill_captcha.
    """
    return kill_captcha(image, 'sc', 'jpeg')


@try_number(100)
def get_html_list(word):
    """POST the company-name search (solving the image captcha) and return
    the result-list page re-encoded from gbk to utf-8.

    Retries the captcha up to 10 times for one session; raises RuntimeError
    when it keeps failing so the @try_number wrapper restarts from scratch.
    (The original raised the undefined name TIMEOUTERROR, i.e. a NameError,
    and reset its counter every pass so the limit never triggered.)
    """
    head = {'Host': 'gsxt.scaic.gov.cn', 'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Googlebot/2.1 (+http://www.googlebot.com/bot.html)', 'Origin': 'http://gsxt.scaic.gov.cn'}
    s = requests.Session()
    word = word.decode('utf8').encode('gbk')  # site expects a gbk query (Python 2 str)
    l = int(time.time() * 1000)  # millisecond timestamp as cache-buster
    # prime the session (cookies) before requesting the captcha
    s.get('http://gsxt.scaic.gov.cn/ztxy.do?method=list&djjg=&random=%s' % l, headers=head, timeout=20)
    failures = 0  # counter now lives OUTSIDE the loop so the 10-try limit works
    while True:
        try:
            image = s.get('http://gsxt.scaic.gov.cn/ztxy.do?method=createYzm&dt=%s&random=%s' % (l, l),
                          headers=head).content
        except Exception:
            print('图片地址拒绝访问')
            continue  # fix: `image` would be unbound below on the first failure
        yzm = get_image(image)
        data = {'currentPageNo': '1', 'yzm': '%s' % yzm, 'cxym': 'cxlist', 'maent.entname': '%s' % word}
        html = s.post('http://gsxt.scaic.gov.cn/ztxy.do?method=list&djjg=&random=%s' % l, data=data, headers=head,
                      timeout=20).content
        result = re.findall("var flag = '(.*?)'", html)
        # an empty flag value means the captcha was accepted
        if result and not result[0]:
            return html.decode('gbk').encode('utf8')
        failures += 1
        if failures >= 10:
            raise RuntimeError('captcha rejected 10 times for %r' % word)


def year_port(html_2014, year):
    """Parse one GSXT annual-report page into a JSON-object string.

    html_2014: raw annual-report detail page (gbk re-encoded to utf-8).
    year: the report year, echoed into the result.
    Returns a JSON string with baseInfo, website, investorInformations,
    assetsInfo, equityChangeInformations, changeRecords and entinvItemList.

    NOTE(review): relies on `get_value`, which is not defined or imported
    in this file — confirm it exists at runtime.
    """

    def _squeeze(value):
        # Drop every space/tab/newline/CR from a table cell's .string text.
        return value.replace(' ', '').replace('\t', '').replace('\n', '').replace('\r', '')

    soup = BeautifulSoup(html_2014, "html.parser")
    # Collapse ALL whitespace so the literal-tag regexes below (e.g.
    # '<thwidth="20%...') match regardless of the page's original spacing.
    html = html_2014.replace('\r', '').replace('\n', '').replace(' ', '').replace('\t', '')
    # (removed leftover debug dump of `html` to a.html)
    regNo = reclean('<thwidth="20%.+?>注册号</th><tdwidth="30%">(.+?)</td>', html)
    if not regNo:
        # some layouts use a 20%-wide value cell instead
        regNo = reclean('<thwidth="20%.+?>注册号</th><tdwidth="20%">(.+?)</td>', html)
    phone = reclean('<thwidth="20%.+?>企业联系电话</th><tdwidth="30%"(.+?)</td>', html)
    email = reclean('<thwidth="20%.+?>电子邮箱</th><tdwidth="30%"(.+?)</td>', html)
    zipcode = reclean('<thwidth="20%.+?>邮政编码</th><tdwidth="30%"(.+?)</td>', html)
    enterpriseStatus = reclean('<thwidth="20%.+?>企业经营状态</th><tdwidth="30%"(.+?)</td>', html)
    haveWebsite = reclean('<thwidth="20%.+?>是否有网站或网店</th><tdwidth="30%"(.+?)</td>', html)
    buyEquity = reclean('<thwidth="20%.+?>企业是否有投资信息或购买其他公司股权</th><tdwidth="30%"(.+?)</td>', html)
    equityTransfer = reclean('<thwidth="20%.+?>有限责任公司本年度是否发生股东股权转让</th><tdwidth="30%"(.+?)</td>', html)
    address = reclean('<thwidth="20%.+?>企业通信地址</th><t.+?>(.+?)</td>', html)
    employeeCount = reclean('<thwidth="20%.+?>从业人数</th><tdwidth="30%">(.+?)</td>', html)
    baseInfo = '{"regNo":"%s","phone":"%s","email":"%s","zipcode":"%s","enterpriseStatus":"%s","haveWebsite":"%s","buyEquity":"%s","equityTransfer":"%s","address":"%s","employeeCount":"%s"}' % (
        regNo, phone, email, zipcode, enterpriseStatus, haveWebsite, buyEquity, equityTransfer, address, employeeCount)

    # --- website / web-shop section ---
    website = '{"type":"","name":"","link":""}'
    if '网站或网店信息' in html:
        # NOTE(review): when several rows match, only the last one survives —
        # matches the original behaviour.
        for row in re.findall('<trid="tr_wzxx_1"name="wzxx">(.+?)</tr>', html):
            cells = re.findall('<td>(.+?)</td>', row)
            website = '{"type":"%s","name":"%s","link":"%s"}' % (cells[0], cells[1], cells[2])

    # --- investor / capital-contribution rows (newest first, as before) ---
    investor_parts = []
    for raw_row in re.findall('<tr id="tr_tzrxx_\d+" name="tzrxx">([\s\S]+?)</tr>', html_2014):
        row = raw_row.replace('\t', '').replace('\n', '').replace('\r', '').replace(' ', '')
        shareholderName = get_value('<divstyle="width:100px;">(.+?)</div>', row)
        subConam = get_value('<listyle="text-align:left;">(.+?)&nbsp;</li>', row)
        try:
            subConDate = time_time(re.findall('<listyle="text-align:center;">(\d+年\d+月\d+日)</li>', row)[0])
        except IndexError:
            subConDate = ''
        try:
            subConType = re.findall('<lititle=".+?"style="text-align:left;">(.+?)</li>', row)[0]
        except IndexError:
            subConType = ''
        try:
            paidType = re.findall('<lititle=".+?"style="text-align:left;">(.+?)</li>', row)[1]
        except IndexError:
            paidType = ''
        # same pattern as subConam — first '...&nbsp;</li>' cell, as in the original
        paidConMoney = get_value('<listyle="text-align:left;">(.+?)&nbsp;</li>', row)
        try:
            paidTime = time_time(re.findall('<listyle="text-align:center;">(\d+年\d+月\d+日)</li>', row)[1])
        except IndexError:
            paidTime = ''
        investor_parts.insert(0, '{"shareholderName":"%s","subConam":"%s","subConDate":"%s","subConType":"%s","paidConMoney":"%s","paidTime":"%s","paidType":"%s"}' % (
            shareholderName, subConam, subConDate, subConType, paidConMoney, paidTime, paidType))
    investorInformations = '[%s]' % ','.join(investor_parts)

    # --- assets / financial figures (4th "detailsList" table) ---
    try:
        cells = soup.findAll('table', {"class": "detailsList"})[3].findAll('td')
        generalAssets = _squeeze(cells[0].string)
        ownersEequity = _squeeze(cells[1].string)
        revenue = _squeeze(cells[2].string)
        profit = _squeeze(cells[3].string)
        mainRevenue = _squeeze(cells[4].string)
        netProfit = _squeeze(cells[5].string)
        taxPayment = _squeeze(cells[6].string)
        liability = _squeeze(cells[7].string)
        # BUG fix: the "liability" field used to be formatted with taxPayment
        # twice; it now receives the liability value read above.
        assets_Info = '{"generalAssets":"%s", "ownersEequity":"%s", "revenue":"%s","profit":"%s","mainRevenue":"%s","netProfit":"%s","taxPayment":"%s","liability":"%s"}' % (
            generalAssets, ownersEequity, revenue, profit, mainRevenue, netProfit, taxPayment, liability)
        assets_Info = assets_Info.replace('\r', '')
    except IndexError:
        # no assets table on this page (or fewer than 8 cells)
        assets_Info = "{}"

    # --- change-record (bgxx) rows ---
    change_parts = []
    for tr in soup.findAll('tr', {"name": "bgxx"}):
        tds = tr.findAll('td')
        afterChange = html_clean(tds[3])
        if u'收起更多' in afterChange:
            # long cells hide the full text behind a "show more" toggle
            afterChange = afterChange.split(u'更多')[1]
        change_parts.insert(0, str({"changedItem": html_clean(tds[1]), "beforeChange": html_clean(tds[2]),
                                    "afterChange": afterChange, "time": html_clean(tds[4])}))
    changeRecords = '[%s]' % ','.join(change_parts)

    # --- equity-change (gqbg) rows ---
    equity_parts = []
    for row in re.findall('<trname="gqbg"id="tr_gqbg_\d">(.+?)</tr>', html):
        left_cells = re.findall('<tdstyle="text-align:left;">(.*?)</td>', row)  # hoisted: was scanned 3x
        change_time = time_time(re.findall('<tdstyle="text-align:center;">(.*?)</td>', row)[0])
        equity_parts.insert(0, '{"shareholderName":"%s","equityAfter":"%s","equityBefore":"%s","time":"%s"}' % (
            left_cells[0], left_cells[2], left_cells[1], change_time))
    equityChangeInformations = "[%s]" % ','.join(equity_parts)

    # --- outbound-investment (tzxx) rows ---
    try:
        entinv = re.findall(u'对外投资信息</th></tr>([\s\S]+?)</table>', html)[0]
        inv_parts = []
        for row in re.findall('<trname="tzxx"id="tr_tzxx_\d">[\s\S]+?</tr>', entinv):
            result = re.findall('<tdstyle="text-align:left;">(.+?)</td>', row)
            inv_parts.insert(0, '{"entName":"%s","entType":"","fundedRatio":"","currency":"","entStatus":"","canDate":"","esDate":"","regOrg":"","regCapcur":"","regCap":"","revDate":"","name":"","subConam":"","regNo":"%s"}' % (
                result[0], result[1]))
        entinvItemList = '[%s]' % ','.join(inv_parts)
    except IndexError:
        # section absent on this page
        entinvItemList = '[]'
    yearReportListstr1 = '{"year":"%s","entinvItemList":%s,"baseInfo":%s,"website":%s,"investorInformations":%s,"assetsInfo":%s,"equityChangeInformations":%s,"changeRecords":%s}' % (
        year, entinvItemList, baseInfo, website, investorInformations, assets_Info, equityChangeInformations,
        changeRecords)
    return yearReportListstr1


# @try_number(10)
def get_source(html, word):
    """Crawl every detail tab for the first company on the search-result page.

    html: utf-8 search-result list page (from get_html_list).
    word: the original search keyword.
    Returns a 2-tuple:
      [0] raw record {"province","type","html","keyword","companyName","yearList"}
      [1] parsed record assembled from the tab pages plus annual reports.

    NOTE(review): depends on `table.basic` and on `string_set`, which is not
    defined or imported in this file — confirm they exist at runtime.
    """
    s = requests.Session()
    l = int(time.time() * 1000)  # millisecond timestamp used as cache-buster
    head = {'Host': 'gsxt.scaic.gov.cn', 'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Googlebot/2.1 (+http://www.googlebot.com/bot.html)', 'Origin': 'http://gsxt.scaic.gov.cn'}
    print html
    companyName = re.search('''<a href="javascript:void\(0\);" onclick=".+?" >(.+?)</a>''', html).group(1)

    keyword = word
    # openView(...) carries the company's pripid and entbigtype arguments
    do_data = re.findall('<a href="javascript:void\(0\);" onclick="openView\((.+?)\)"', html)[0]
    sdata = do_data.split(',')
    invdata = {'method': 'qyInfo', 'djjg': '', 'maent.pripid': '%s' % sdata[0].replace("'", ""),
               'maent.entbigtype': '%s' % sdata[1].replace("'", ""), 'random': '%s' % l}
    # tab 1: basic company info (pages are gbk; everything is re-encoded utf-8)
    html_1 = s.post('http://gsxt.scaic.gov.cn/ztxy.do', headers=head, data=invdata).content.decode('gbk').encode('utf8')
    text = html_1.replace('\r', '').replace('\n', '').replace('\t', '').replace(' ', '')
    pripid = re.findall("sfgsInfo&maent\.pripid=(.+?)&", text)[0]

    # tab 2: filings / branches
    data_data = {'method': 'baInfo', 'maent.pripid': '%s' % pripid, 'czmk': 'czmk2', 'random': '%s' % l}
    html_2 = s.post('http://gsxt.scaic.gov.cn/ztxy.do', headers=head, data=data_data).content.decode('gbk').encode(
        'utf8')

    # tab 3: chattel mortgage info
    data_3 = {'method': 'dcdyInfo', 'maent.pripid': '%s' % pripid, 'czmk': 'czmk4', 'random': '%s' % l}
    html_3 = s.post('http://gsxt.scaic.gov.cn/ztxy.do', headers=head, data=data_3).content.decode('gbk').encode('utf8')

    # tab 4: equity pledge info
    data_4 = {'method': 'gqczxxInfo', 'maent.pripid': pripid, 'czmk': 'czmk4', 'random': '%s' % l}
    html_4 = s.post('http://gsxt.scaic.gov.cn/ztxy.do', headers=head, data=data_4, timeout=10).content.decode(
        'gbk').encode('utf8')

    # tab 5: administrative penalties
    data_5 = {'method': 'cfInfo', 'maent.pripid': pripid, 'czmk': 'czmk3', 'random': '%s' % l}
    html_5 = s.post('http://gsxt.scaic.gov.cn/ztxy.do', headers=head, data=data_5).content.decode('gbk').encode('utf8')

    # tab 6: abnormal-operation list
    data_6 = {'method': 'jyycInfo', 'maent.pripid': pripid, 'czmk': 'czmk6', 'random': '%s' % l}
    html_6 = s.post('http://gsxt.scaic.gov.cn/ztxy.do', headers=head, data=data_6).content.decode('gbk').encode('utf8')

    # tab 7: serious violations
    data_7 = {'method': 'yzwfInfo', 'maent.pripid': pripid, 'czmk': 'czmk14', 'random': l}
    html_7 = s.post('http://gsxt.scaic.gov.cn/ztxy.do', headers=head, data=data_7).content.decode('gbk').encode('utf8')

    # tab 8: spot-check records
    data_8 = {'method': 'ccjcInfo', 'maent.pripid': pripid, 'czmk': 'czmk7', 'random': l}
    html_8 = s.post('http://gsxt.scaic.gov.cn/ztxy.do', headers=head, data=data_8).content.decode('gbk').encode('utf8')

    all_html = '%s%s%s%s%s%s%s%s' % (html_1, html_2, html_3, html_4, html_5, html_6, html_7, html_8)
    l = int(time.time() * 1000)
    # annual-report (qygsInfo) index page: lists the available report years
    year_data = {'method': 'qygsInfo', 'maent.pripid': '%s' % pripid, 'czmk': 'czmk8', 'random': '%s' % l}
    year_html = s.post('http://gsxt.scaic.gov.cn/ztxy.do', headers=head, data=year_data).content.decode('gbk').encode(
        'utf8')

    report_year_list = re.findall('''onclick="doNdbg\('(.+?)'\)''', year_html)
    # NOTE(review): findall returns a list, never '' — this branch is dead
    # (the else below initialises the same two lists anyway).
    if report_year_list == '':
        yearList = []
        report_year = []

    else:
        yearList = []
        report_year = []
        for i in report_year_list:
            yearnum = i
            # fetch and parse the detail page for each report year
            data_2 = {"method": "ndbgDetail", "maent.pripid": "%s" % pripid, "maent.nd": i, "random": "%s" % l}
            html_2014 = s.post('http://gsxt.scaic.gov.cn/ztxy.do', headers=head, data=data_2).content.decode(
                'gbk').encode('utf8')
            year_source = year_port(html_2014, yearnum)
            report_year.append(year_source)
            year_dic = {"year": yearnum, "html": html_2014}
            yearList.append(year_dic)
    # result part one: raw html bundle plus per-year pages
    result_result = {"province": "sc", "type": 0, "html": all_html, "keyword": keyword, "companyName": companyName,
                     "yearList": yearList}
    # result part two: parsed fields, assembled below as a dict-literal string
    # NOTE(review): `str1` stays unbound (NameError at the end) if no table
    # contains '基本信息'.
    tables = re.findall('<table[\s\S]+?</table>', html_1)
    for i in tables:
        if '基本信息' in i:
            str1 = table.basic(i)
    temp_html = html_1
    tables = re.findall('<table[\s\S]+?</table>', temp_html)
    # captures run up to '/td>' (including the '<'), which is then stripped below
    list_list = re.findall('<td align="center" style="width:20%;text-align:left;">([\s\S]+?)/td>', html_2)
    str3 = ''
    # pairs of cells: (name, position)
    for j in range(0, len(list_list), 2):
        k = 0  # NOTE(review): unused
        str3_1 = '{"name":"%s","position":"%s","sex":""}' % (
        list_list[j].replace('<', ''), list_list[j + 1].replace('<', ''))
        str3 = str3_1 + ',\n' + str3
    str3 = '"personList":[%s]' % str3.rstrip('\n').rstrip(',')
    str4 = '"punishBreakList":[]'  # dishonest judgment debtor ("shixin") info
    str5 = '"punishedList":[]'  # judgment debtor (enforcee) info
    str6 = '"alidebtList":[]'  # Alibaba debt info
    str8 = '"frinvList":[]'  # legal representative's outbound investments
    str9 = '"frPositionList":[]'  # legal representative's positions at other companies
    str10 = ""
    # change-record rows: item, date, before/after values (possibly behind "more" toggles)
    detail10 = re.findall('<tr width="95%" id="tr_bg_\d+" name="bg">([\s\S]+?)</tr>', html_1)
    for i in range(len(detail10)):
        detail101 = detail10[i].replace('\r', '').replace('\n', '').replace('\t', '')
        altltem = re.findall('<td width="15%">(.+?)</td>', detail101)[0]
        altDate = time_time(re.findall('<td width="10%" style="text-align:center;">(.+?)</td>', detail101)[0])
        result = re.findall('<td width="25%">[\s\S]+?</td>', detail101)
        if '<span style="width: 100%;">' in result[0]:
            altBe = re.findall('<span style="width: 100%;">([\s\S]+?)</span>', result[0])[0]
        else:
            altBe = re.findall('<span id="beforeMore\d+_\d+" style="display:none;width:100%;">(.+?)<br/>', result[0])[0]
        if '<span style="width: 100%;">' in result[1]:
            altAf = re.findall('<span style="width: 100%;">([\s\S]+?)</span>', result[1])[0]
        else:
            altAf = re.findall('<span id="beforeMore\d+_\d+" style="display:none;width:100%;">(.+?)<br/>', result[1])[0]
        str10_1 = '{"altDate":"%s","altltem":"%s","altBe":"%s","altAf":"%s"}' % (altDate, altltem, altBe, altAf)
        str10 = str10_1 + ',\n' + str10
    str10 = '"alterList":[%s]' % str10.rstrip('\n').rstrip(',')
    str11 = ''
    # branch (filiation) rows: cell 0 = reg number, cell 1 = branch name
    detail11 = re.findall('<tr name="fr2" id="tr_fr2_\d+">([\s\S]+?)</tr>', html_2)
    for i in range(len(detail11)):
        detail_list = re.findall('<td style="text-align:left;">(.+?)</td>', detail11[i])
        str11_1 = '{"brName":"%s","brRegno":"%s","brPrincipal":"","cbultem":"","brAddr":""}' % (
            detail_list[1].decode('gbk'), detail_list[0].replace('\n', '').replace('\r', '').decode('gbk'))
        str11 = str11_1 + ',\n' + str11
    str11 = '"filiationList":[%s]' % str11.rstrip('\n').rstrip(',')
    str12 = '"caselnfoList":[]'
    str13 = '"sharesFrostList":[]'
    str14 = ""
    # equity-pledge rows from tab 4
    soup3 = BeautifulSoup(html_4, "html.parser")
    no = soup3.find_all(id=re.compile('tr_gq_\d+'))
    if len(no) == 0:
        str14 = ""
    else:
        for i in no:
            text = i.find_all('td')
            str14_1 = '{"impoRg":"%s","impoRgtype":"%s","impAm":"%s","imponrecDate":"","impExaeep":"","impSanDate":"%s","impTo":""}' % (
                string_set(text[5].string), string_set(text[2].string), string_set(text[4].string),
                time_time(text[7].string))
            str14 = str14_1 + ',\n' + str14
    str14 = '"shareslmpawnList":[%s]' % str14.rstrip('\n').rstrip(',')
    str15 = '"morDetailList":[]'
    str16 = '"morgualnfoList":[]'
    # liquidation info from tab 2
    soup2 = BeautifulSoup(html_2, "html.parser")
    no2 = soup2.find_all('td', colspan="4")
    if len(no2) == 0:
        str17 = '"liquidationList":[]'
    else:
        # NOTE(review): `liGst` is an unquoted key — eval(result1) below will
        # raise NameError whenever this branch is taken.
        liquidationList = '{"ligentity":"","ligprincipal":"%s","liqMen":"%s",liGst:"","ligEndDate":"","debtTranee":"","claimTranee":""}' % (
            string_set(no2[0].string), string_set(no2[1].string))
        # NOTE(review): duplicated assignment (`str17 = str17 = ...`) — harmless but a typo
        str17 = str17 = '"liquidationList":%s' % liquidationList
    result1 = '{%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s}' % (
    str3, str4, str5, str6, str8, str9, str10, str11, str12, str13, str14, str15, str16, str17)
    # NOTE(review): eval of a string assembled from scraped page content —
    # unsafe on untrusted input; ast.literal_eval (or building dicts directly)
    # would be safer.
    result1 = eval(result1)
    result1['report_year'] = report_year
    result1['basicList'] = str1
    return (result_result, result1)


def search(word):
    """Crawl company *word* and return (raw_record, parsed_record).

    When the search page comes back empty, appends the keyword to
    research.txt (for a later retry pass) and returns the keyword instead.
    """
    html = get_html_list(word)
    if not html:
        # fix: the file handle used to be leaked by open(...).write(...)
        with open(path + '/research.txt', 'a') as fh:
            fh.write(str(word) + '\n')
        return word
    result = get_source(html, word)
    print(json.dumps(result))
    return result


if __name__ == '__main__':
    # manual smoke test: crawl one known company and pretty-print the parsed record
    print json.dumps(search(u'泰康人寿保险股份有限公司泸州中心支公司')[1], ensure_ascii=False, indent=4)
